PhD

The LaTeX sources of my Ph.D. thesis
git clone https://esimon.eu/repos/PhD.git

word2vec embeddings.xml (1704B)


<!--
The script below generated this file: it projects the word2vec vectors of
eight country and capital names down to two dimensions with PCA and
serializes the result as XML.

import gensim.downloader
import sklearn.decomposition
from xml.etree import ElementTree

TARGET_WORDS = ["Paris", "France", "Madrid", "Spain", "Italy", "Rome", "Germany", "Berlin"]

# Load the pre-trained 300-dimensional Google News word2vec vectors.
data = gensim.downloader.load('word2vec-google-news-300')
# Indexing a gensim KeyedVectors with a list of words stacks their
# vectors into an 8x300 matrix.
source = data[TARGET_WORDS]

# Project the vectors from 300 dimensions down to 2 with a full-SVD PCA.
pca = sklearn.decomposition.PCA(n_components=2, svd_solver='full')
target = pca.fit_transform(source)

# Serialize each projected point as an <embedding> element with its label.
root = ElementTree.Element("embeddings")
for word, vector in zip(TARGET_WORDS, target):
    embedding = ElementTree.SubElement(root, "embedding")
    ElementTree.SubElement(embedding, "x").text = str(vector[0])
    ElementTree.SubElement(embedding, "y").text = str(vector[1])
    ElementTree.SubElement(embedding, "label").text = str(word)

# Record the fraction of variance explained by each of the two components.
explained = ElementTree.SubElement(root, "explained")
ElementTree.SubElement(explained, "x").text = str(pca.explained_variance_ratio_[0])
ElementTree.SubElement(explained, "y").text = str(pca.explained_variance_ratio_[1])

tree = ElementTree.ElementTree(root)
tree.write("word2vec embeddings.xml")
-->
<embeddings>
  <embedding><x>1.0263773</x><y>0.23883666</y><label>Paris</label></embedding>
  <embedding><x>-0.947096</x><y>0.05958702</y><label>France</label></embedding>
  <embedding><x>0.93604654</x><y>-1.3953391</y><label>Madrid</label></embedding>
  <embedding><x>-0.873475</x><y>-1.1731068</y><label>Spain</label></embedding>
  <embedding><x>-1.005623</x><y>-0.33860308</y><label>Italy</label></embedding>
  <embedding><x>1.3307737</x><y>-0.19644451</y><label>Rome</label></embedding>
  <embedding><x>-1.2115865</x><y>1.060732</y><label>Germany</label></embedding>
  <embedding><x>0.7445842</x><y>1.7443377</y><label>Berlin</label></embedding>
  <explained><x>0.27626586</x><y>0.25357923</y></explained>
</embeddings>
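
For reference, a minimal sketch of how this file could be read back and plotted. This script is not part of the repository; the matplotlib scatter plot is an assumption about downstream use (the thesis presumably draws the figure from the XML directly).

# Hypothetical consumer of "word2vec embeddings.xml" (not in the repository):
# parse the points back and draw a labelled scatter plot with matplotlib.
from xml.etree import ElementTree

import matplotlib.pyplot as plt

root = ElementTree.parse("word2vec embeddings.xml").getroot()

# Each <embedding> holds the 2D PCA coordinates and the word label.
points = [(float(e.findtext("x")), float(e.findtext("y")), e.findtext("label"))
          for e in root.iter("embedding")]

xs, ys, labels = zip(*points)
plt.scatter(xs, ys)
for x, y, label in zip(xs, ys, labels):
    plt.annotate(label, (x, y))

# <explained> holds the variance ratio captured by each principal component.
explained = root.find("explained")
plt.xlabel("PC1 ({:.0%} of variance)".format(float(explained.findtext("x"))))
plt.ylabel("PC2 ({:.0%} of variance)".format(float(explained.findtext("y"))))
plt.show()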